#!/usr/bin/perl -w

use strict;
use warnings;
use List::Util qw(first);

# Build a three-language test dataset from two bilingual ones.
#
# Usage: create3Lwordsdataset.pl LANG1 LANG2 WORDFILE
#
# For every word listed in WORDFILE (one word per line) the script reads
#   Test-en/WORD/WORD-corpus.xml         (English sentences, used as search keys)
#   Test-en-LANG1/WORD/WORD-corpus.xml   (corpus for the en-LANG1 pair)
#   Test-en-LANG2/WORD/WORD-corpus.xml   (corpus for the en-LANG2 pair)
# and appends to Test-en-LANG1-LANG2/WORD/WORD-td.txt one line per English
# sentence that is found in both bilingual corpora:
#   "<first matching LANG1 line> . <first matching LANG2 line minus the sentence>"
#
# All directories are resolved relative to the directory the script is
# started from. The script dies on any file or directory it cannot access.

# Read a whole text file and return its lines without trailing newlines.
# Dies with "$error_prefix $path: <reason>" when the file cannot be opened.
sub _read_lines {
    my ($path, $error_prefix) = @_;

    open my $fh, '<', $path or die "$error_prefix $path: $!\n";
    my @lines = <$fh>;
    close $fh;

    chomp @lines;
    return @lines;
}

if (@ARGV != 3) {
    print "usage: create3Lwordsdataset.pl LANG1 LANG2 WORDFILE\n";
    exit;
}

my ($l1, $l2, $wordfile) = @ARGV;

my $src_test_dir  = 'Test-en';
my $test_dir_l1   = "Test-en-$l1";
my $test_dir_l2   = "Test-en-$l2";
my $dest_test_dir = "Test-en-$l1-$l2";

# Lines of the English corpus that carry one of these tags are markup, not
# sentences, and are never used as search keys.
my $tag_line_re = qr/<.*instance.*>|<.*context.*>|<.*lexelt.*>|<.*corpus.*>/;

print "Result will be written in $dest_test_dir\n";

# An already existing destination directory is fine (re-runs append).
unless (-d $dest_test_dir) {
    mkdir $dest_test_dir
        or die "Could not create directory $dest_test_dir: $!\n";
}

# The word file path is relative to the start directory, so read it before
# changing into the destination directory.
my @words = _read_lines($wordfile, 'Could not open file');

chdir $dest_test_dir
    or die "Could not enter directory $dest_test_dir: $!\n";

# For each word in the words file
WORD:
for my $word (@words) {

    # A blank line in the word file names no word; skipping it avoids
    # mkdir/chdir on an empty name.
    next WORD unless $word =~ /\S/;

    # Work inside the per-word output directory; the "../../" prefixes below
    # lead back to the start directory from there.
    unless (-d $word) {
        mkdir $word or die "Could not create directory $word: $!\n";
    }
    chdir $word or die "Could not enter directory $word: $!\n";

    print "Processing $word..\n";

    my $corpusfile_en = "../../$src_test_dir/$word/$word-corpus.xml";  # lines used as sentence search keys
    my $corpusfile_l1 = "../../$test_dir_l1/$word/$word-corpus.xml";   # corpus for the en-l1 pair
    my $corpusfile_l2 = "../../$test_dir_l2/$word/$word-corpus.xml";   # corpus for the en-l2 pair
    my $outputfile    = "$word-td.txt";                                # test data is appended here

    # Keep only the English lines that do not contain tags.
    my @file_en = grep { !/$tag_line_re/ } _read_lines($corpusfile_en, 'Cannot open');
    my @file_l1 = _read_lines($corpusfile_l1, 'Cannot open');
    my @file_l2 = _read_lines($corpusfile_l2, 'Cannot open');

    open my $out_fh, '>>', $outputfile or die "Could not open file: $!\n";

    # For each sentence line of the English word corpus
    LINE:
    for my $line (@file_en) {

        # An empty search key would be treated by Perl as "reuse the last
        # successful pattern", and a whitespace-only key matches nearly any
        # line; neither identifies a sentence.
        next LINE unless $line =~ /\S/;

        # Literal substring search; only the first hit in each corpus is used.
        my $hit_l1 = first { index($_, $line) >= 0 } @file_l1;
        my $hit_l2 = first { index($_, $line) >= 0 } @file_l2;
        next LINE unless defined $hit_l1 && defined $hit_l2;

        # Drop the English sentence from the l2 line so that it is not
        # repeated after the l1 line. Only the variable part is quoted with
        # \Q..\E; the copy keeps @file_l2 untouched.
        (my $rest_l2 = $hit_l2) =~ s/\Q$line\E//;

        print {$out_fh} "$hit_l1 . $rest_l2\n";
    }

    # Buffered write errors only surface at close time.
    close $out_fh or die "Could not write file $outputfile: $!\n";

    chdir '..' or die "Could not leave directory $word: $!\n";
}